build Bordes FB15k.py (3179B)
1 #!/usr/bin/env python2 2 3 from __future__ import print_function 4 import sys 5 import os 6 from log import log 7 8 urls = [ 'https://www.hds.utc.fr/everest/lib/exe/fetch.php?id=en%3Atranse&cache=cache&media=en:fb15k.tgz' ] 9 10 def get_archive(path): 11 import urllib 12 13 class URLopener(urllib.FancyURLopener): 14 def http_error_default(self, url, fp, errcode, errmsg, headers): 15 print('Error: {0} {1}'.format(errcode, errmsg), file=sys.stderr) 16 raise IOError 17 18 archive = path+'/archive.tgz' 19 downloaded = False 20 for url in urls: 21 log('Downloading dataset from "{0}"...'.format(url)) 22 try: 23 URLopener().retrieve(url, archive) 24 downloaded = True 25 log(' done\n') 26 except IOError: 27 pass 28 29 if not downloaded: 30 print('Error: Unable to download dataset.', file=sys.stderr) 31 sys.exit(1) 32 33 def get_raw(path): 34 if os.path.isdir(path+'/raw'): 35 return 36 37 get_archive(path) 38 39 log('Raw files not found, extracting archive...') 40 raw = path+'/raw' 41 os.mkdir(raw) 42 43 import tarfile 44 tar = tarfile.open(path+'/archive.tgz', 'r:gz') 45 tar.extractall(raw) 46 log(' done\n') 47 48 def compile_dataset(path): 49 get_raw(path) 50 prefix = path+'/raw/FB15k/freebase_mtr100_mte100-' 51 suffix = '.txt' 52 53 log('Reading train file...') 54 with open(prefix+'train'+suffix, 'r') as file: 55 content = map(lambda line: line.rstrip('\n').split('\t'), file.readlines()) 56 [left, relations, right] = map(set, zip(*content)) 57 entities = left | right 58 log(' done\n') 59 60 log('Writting entities...') 61 e2i, i2e, r2i, i2r = {}, {}, {}, {} 62 with open(path+'/entities', 'w') as file: 63 i=0 64 for entity in entities: 65 e2i[entity]=i 66 i2e[i]=entity 67 file.write(entity+'\n') 68 i+=1 69 log(' done ({0} entities written)\n'.format(i)) 70 71 log('Writting relations...') 72 with open(path+'/relations', 'w') as file: 73 i=0 74 for relation in relations: 75 r2i[relation]=i 76 i2r[i]=relation 77 file.write(relation+'\n') 78 i+=1 79 log(' done ({0} relations written)\n'.format(i)) 80 81 for name in ['train', 'valid', 'test']: 82 log('Compiling {0}...'.format(name)) 83 count = 0 84 with open(prefix+name+suffix, 'r') as infile: 85 with open(path+'/'+name, 'w') as outfile: 86 for line in infile.readlines(): 87 left, relation, right = line.rstrip('\n').split('\t') 88 if left in e2i and right in e2i and relation in r2i: 89 outfile.write('{0}\t{1}\t{2}\n'.format(e2i[left], r2i[relation], e2i[right])) 90 else: 91 count+=1 92 log(' done ({0} entit{1} removed)\n'.format(count, 'y' if count<2 else 'ies')) 93 94 if __name__ == '__main__': 95 if len(sys.argv)<2: 96 print('Usage: {0} path'.format(sys.argv[0]), file=sys.stderr) 97 sys.exit(1) 98 99 path = sys.argv[1] 100 if not os.path.isdir(path): 101 os.mkdir(path) 102 103 compile_dataset(path) 104 log('Bordes FB15k was successfully built in {0}\n'.format(path))